# Direct outputs not eligible for public release

# GLOBAL SETTINGS --------------------------------------------------------------

options(
    scipen = 999,
    digits = 16,
    max.print = .Machine$integer.max,
    show.error.locations = TRUE,
    warn = 1
)

RNGkind("L'Ecuyer-CMRG")
seed <- 818675309L
set.seed(seed) # setting main seed

# PACKAGES ---------------------------------------------------------------------
library(data.table)
library(fixest)

# PACKAGE SETTINGS -------------------------------------------------------------

# data.table
setDTthreads(threads = 1L)
options(datatable.print.class = TRUE, datatable.print.keys = TRUE)
# so that printing the data.table also shows the variable type on top

# fixest
setFixest_nthreads(nthreads = 1L) # for OLS

# BEGIN FILE -------------------------------------------------------------------

# Read in data for 2003, arbitrary year all of the annual files and combine
# - use "_slim" files as they aren't as wide / more memory-efficient
lottery_dt_2003 <-
    readRDS("~/population-annual-data/lottery_data_slim_2003.rds")
setorderv(lottery_dt_2003, c("tin", "tax_yr"))

# Make event time
lottery_dt_2003[, event_time := as.integer(tax_yr - win_yr)]

# Now flag the age in win year, and those aged 21-64 in their own win year
lottery_dt_2003[
    ,
    age_in_win_year := sum(as.integer(tax_yr == win_yr) * age),
    by = .(tin)
]

# Subset to event times -7 to 5
# - will end up being -4 to 5 since balanced panel starts in 1999
lottery_dt_2003 <-
    copy(
        lottery_dt_2003[
            win_yr == 2003 &
                between(age_in_win_year, 21L, 64L, incbounds = TRUE) &
                between(event_time, -7L, 5L, incbounds = TRUE)
        ]
    )

# Will estimate FD relative to -2 for the normalization
# so don't make a dummy for that event time
current_cols <- copy(colnames(lottery_dt_2003))
lottery_dt_2003[, event_time_pre4 := as.integer(event_time == -4L)]
lottery_dt_2003[, event_time_pre3 := as.integer(event_time == -3L)]
lottery_dt_2003[, event_time_pre1 := as.integer(event_time == -1L)]
lottery_dt_2003[, event_time_0 := as.integer(event_time == 0L)]
lottery_dt_2003[, event_time_1 := as.integer(event_time == 1L)]
lottery_dt_2003[, event_time_2 := as.integer(event_time == 2L)]
lottery_dt_2003[, event_time_3 := as.integer(event_time == 3L)]
lottery_dt_2003[, event_time_4 := as.integer(event_time == 4L)]
lottery_dt_2003[, event_time_5 := as.integer(event_time == 5L)]
dummy_cols <- setdiff(colnames(lottery_dt_2003), current_cols)

fd_cohort_formula <-
    paste0(
        "db_w2_wages",
        "~",
        paste0(dummy_cols, collapse = "+"),
        "|",
        "factor(age)"
    )

fd_cohort_coefs <-
    coef(feols(as.formula(fd_cohort_formula), data = lottery_dt_2003))

fd_cohort_coefs <-
    rbindlist(
        list(
            data.table(
                event_time = setdiff(-4L:5L, -2L),
                estimate = fd_cohort_coefs
            ),
            data.table(event_time = -2L, estimate = 0)
        ),
        use.names = TRUE
    )
setorderv(fd_cohort_coefs, "event_time")

fd_cohort_coefs[, estimate_type := "coefficient"]

# Now we build the levels up from the above contrasts
# - just to add back the population mean so the intercept is correct
fd_cohort_levels <- copy(fd_cohort_coefs)

# Add back the population mean
fd_cohort_levels[, estimate := estimate + mean(lottery_dt_2003$db_w2_wages)]

# Pin illustrative values for figure
# - pre_line_val: value in omitted year, -2
# - post_line_val: mean in the first five years post win
pre_line_val <-
    round(
        fd_cohort_levels[event_time == -2L]$estimate,
        digits = 2L
    )
post_line_val <-
    round(
        mean(fd_cohort_levels[between(event_time, 1L, 5L)]$estimate),
        digits = 2L
    )

fd_cohort_levels[event_time <= 0L, pre_line := pre_line_val]
fd_cohort_levels[event_time >= 0L, post_line := post_line_val]

fd_cohort_levels[, estimate_type := "level"]

# Combine into result set
fd_cohort_coefs[, c("pre_line", "post_line") := NA]
fd_cohort_results <-
    rbindlist(list(fd_cohort_levels, fd_cohort_coefs), use.names = TRUE)

saveRDS(fd_cohort_results, "~/estimation-output/single-cohort-fd-results.rds")

# Housekeeping
lottery_dt_2003 <- NULL
current_cols <- NULL
dummy_cols <- NULL
fd_cohort_formula <- NULL
fd_cohort_coefs <- NULL
fd_cohort_levels <- NULL
pre_line_val <- NULL
post_line_val <- NULL
fd_cohort_results <- NULL
rm(
    lottery_dt_2003,
    current_cols,
    dummy_cols,
    fd_cohort_formula,
    fd_cohort_coefs,
    fd_cohort_levels,
    pre_line_val,
    post_line_val,
    fd_cohort_results
)